########################################################################
####              This script creates the Release and Crime         ####
####                versus Predicted Risk plots shown               ####
####                            in Figure II.                       ####
########################################################################
require(plyr)
require(dplyr)
require(caret)
require(ggplot2)
require(pROC)

### load total training set
df <- read.csv('bail_dataset.csv')

##### 1. Divide into Train / Test / Impute #####
set.seed(1234567890)

###
# set aside 20% for test
test.ix     <- createDataPartition(df$fta, p = .2, list = F)
test        <- df[test.ix,]

# remove test from rest
df          <- df[-test.ix,]

# split the remaining half
train.ix    <- createDataPartition(df$fta, p = .5, list = F)

train       <- df[train.ix, ]
impute      <- df[-train.ix,]

# releases
train.rel   <- which(train$released  == 'Definitely')
impute.rel  <- which(impute$released == 'Definitely')  

#### 2. Model Feature Selection ####
Xs <- c('arr_age','arr_counts','arr_county','arr_mo',
        'arr_charge_category','arr_class_fac', 
        'arr_vfo', 'cycle_cc_dv', 'arr_firearm', 'arr_weapon', 
        'arr_minors', 'arr_hate', 'arr_drug', 'arr_1192', 
        'p_ftas',
        'p_felarr', 'p_misdarr',
        'p_vfoarr', 'p_drugarr', 'p_minorsarr',
        'p_weparr', 'p_soarr', 'p_vtl','p_dwi','p_firearm',
        'p_felcon', 'p_misdcon', 'p_vfocon',
        'p_drugcon', 'p_drugfelcon','p_drugmisdcon','p_minorscon',
        'p_wepcon', 'p_socon', 'p_vtlcon','p_dwifelcon','p_dwimisdcon','p_firearmcon'
)

options(na.action = 'na.pass')

#### 3. Predict ####
### combine the test and imputation sets
test <- test[,names(impute)]
test <- rbind(test, impute)

# make model matrices
train.mat     <- model.matrix(~ ., train[,Xs])[,-1]
impute.mat    <- model.matrix(~ ., impute[,Xs])[,-1]
test.mat      <- model.matrix(~ ., test[,Xs])[,-1]

# read in models
train.fit     <- readRDS('output/models/training_model')
impute.fit    <- readRDS('output/models/imputation_model')

test$phat     <- predict(train.fit,  newdata = test.mat, type = 'prob')[,2]
test$impute   <- predict(impute.fit, newdata = test.mat, type = 'prob')[,2]

############################################################
####  Assign cases to quintiles based on judge leniency ####
############################################################

df$county <- factor(df$arr_county, labels = c('__','BX','BK','MN','QN','SI'))

df$year   <- format(df$larg_date, '%Y')
df$month  <- months(df$larg_date)
df$day    <- weekdays(df$larg_date)
df$day    <- factor(df$day, #### this fixes the order
                    levels = c('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday')) 

df$first_arraign_part <- as.factor(df$first_arraign_part)

df$cell               <- interaction(df$county, df$first_arraign_part, df$day, df$year)
df$cell               <- factor(df$cell)
df$rel                <- as.numeric(df$released == 'Definitely')

test                  <- merge(test, df)
test                  <- data.table(test, key = 'cell') # using data.table will make this much faster

cell_sum              <- summarise(group_by(test, cell), cases = n(), judges = n_distinct(JUDGESCRAMBLED), rel = mean(rel)) %>% arrange(desc(cases))
cells_with_five       <- cell_sum$cell[which(cell_sum$judges >= 5)]

quintile_list         <- vector(mode = 'list', length = length(cells_with_five))
names(quintile_list)  <- cells_with_five

for (c in 1:length(cells_with_five)) {
  temp                <- test[cell == names(quintile_list)[c]]
  temp_judge          <- summarise(group_by(temp, cell, JUDGESCRAMBLED), jdg.rel = mean(rel), cases = n()) %>% arrange(jdg.rel)
  temp_judge$cum      <- cumsum(temp_judge$cases)/sum(temp_judge$cases)
  temp_judge$quintile <- cut(temp_judge$cum, breaks = seq(0, 1, .2), include.lowest = T, labels = F)
  
  # this can leave holes, as it does in cell 1 
  # fix that
  for (missing in setdiff(1:5, temp_judge$quintile)) {
    if (missing == 5) {temp_judge$quintile[nrow(temp_judge)] <- missing}
    else {temp_judge$quintile[min(which(temp_judge$quintile == missing + 1))] <- missing}
  }
  
  quintile_list[[c]] <- temp_judge %>% select(cell, JUDGESCRAMBLED, quintile)
} 

quintile_table      <- bind_rows(quintile_list)

test                <- as.data.frame(test)
quintile_table      <- as.data.frame(quintile_table)

test                <- merge(test, quintile_table)
test                <- data.table(test, key = c('cell', 'quintile'))

####################################################
####  Functions to generate contraction results ####
####################################################

# function to generate the contracted crime rates using a user specified predictor and outcome
contract <- function(pp_to_det = 1:10, prediction = 'phat', outcome = 'fta') {
  # function of number to contract given in percentage points and a character string
  # specifying the prediction to use to detain people
  pp_to_det       <- round(pp_to_det, 2)
  
  # set up an empty dataframe of the appropriate dimensions to write the contraction results into
  empty           <- cbind(cell = cell_sum$cell, as.data.frame(matrix(nrow = nrow(cell_sum), ncol = 1 + length(pp_to_det))))
  names(empty)    <- c('cell', paste('q5', outcome, sep = '_'), paste(outcome, 'pct', pp_to_det, sep = '_'))
  
  test$temp_pred <- test[,prediction]
  
  for (i in 1:nrow(cell_sum)){
    temp    <- filter(test, cell == cell_sum$cell[i] & quintile == 5)
    
    empty[i, paste('q5', outcome, sep = '_')] <- sum(temp[, outcome])
    
    one_pct <- nrow(temp)/100
    temp    <- filter(temp, released == 'Definitely') %>% arrange(desc(temp_pred))
    
    for (pp in pp_to_det) {
      empty[i, paste(outcome, 'pct', pp, sep ='_')] <- sum(temp[1:round(one_pct*pp, 0), outcome])
    }
  } 
  test$temp_pred <- NULL
  empty[, grep('pct', names(empty))] <- empty[, grep('pct', names(empty))]/empty[, paste('q5', outcome, sep = '_')]
  
  results <- data.frame(pct = substr(colnames(empty)[-(1:2)], 
                                     start = nchar(outcome) + 6, stop = nchar(colnames(empty)[-(1:2)])),
                        crime_reduction = sapply(empty[, -(1:2)], 
                                                 function(x) {weighted.mean(x, w = cell_sum$cell.cases, na.rm = T)}),
                        stringsAsFactors = F)
  results[1 + nrow(results), ] <- c(0, 0)
  return(results)
}

# function to generate the empirical judge points
empirical_points <- function(outcome = 'fta') {  
  
  test$outcome    <- test[, outcome]
  
  t_cell_sum      <- summarise(group_by(test, cell), cell.cases = n(), boro = unique(county))
  t_empirical_sum <- summarise(group_by(test, cell, quintile), 
                               cases = n(), rel.rate = mean(rel),
                               outcome.rate = mean(outcome), outcome.n = sum(outcome)) 
  
  t_q5_sum        <- filter(t_empirical_sum, quintile == 5)
  t_q5_sum        <- merge(t_q5_sum, t_cell_sum[,1:2])
  
  t_empirical_sum                 <- merge(t_cell_sum, t_empirical_sum)
  t_empirical_sum                 <- merge(t_empirical_sum, t_q5_sum, by = 'cell', all.x = T, suffixes = c('', '.q5'))
  t_empirical_sum                 <- dplyr::arrange(t_empirical_sum, desc(cell.cases), desc(quintile))
  
  ##### difference in detention and crime rate
  t_empirical_sum$det.rate        <- 1 - t_empirical_sum$rel.rate
  t_empirical_sum$det.rate.q5     <- 1 - t_empirical_sum$rel.rate.q5
  
  t_empirical_sum$delta_det       <- t_empirical_sum$det.rate - t_empirical_sum$det.rate.q5
  t_empirical_sum$delta_outcome   <- (t_empirical_sum$outcome.rate - t_empirical_sum$outcome.rate.q5)/t_empirical_sum$outcome.rate.q5
  
  
  plot_data_emp <- filter(t_empirical_sum, outcome.rate.q5 > 0) %>% 
    group_by(quintile) %>%
    summarise(X = weighted.mean(delta_det,     w = cell.cases, na.rm = T),
              Y = weighted.mean(delta_outcome, w = cell.cases, na.rm = T)) %>% arrange(quintile)
  
  return(plot_data_emp)
}

####################################
####    Create the Basic Plot   ####
####################################
empirical_points  <- empirical_points()

# ML Contracted results
ml_contraction    <- contract(pp_to_det = 1:23)

# Randomly Contracted Results
# generate a random value to use as a prediction
test$rnd          <- runif(nrow(test))
rnd_contraction   <- contract(pp_to_det = 1:23, prediction = 'rnd')

# final plot
ggplot(empirical_points, aes(x = -1*X, y = Y, col = 'Judges')) + 
  geom_point(size = 2) + 
  geom_line(data = ml_contraction,   aes(x = -as.numeric(pct)/100, y = -1*crime_reduction, col = 'Lenient Judge + ML')) +
  geom_line(data = rnd_contraction,  aes(x = -as.numeric(pct)/100, y = -1*crime_reduction, col = 'rand'), lty = 2)  

